{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import plotly\n",
    "import plotly.graph_objs as go\n",
    "plotly.offline.init_notebook_mode()\n",
    "import os\n",
    "import csv\n",
    "from tqdm import tqdm\n",
    "import random\n",
    "\n",
    "# Directory holding the raw inputs: the movie database JSON and the user CSV folder.\n",
    "DATA_DIR = \"../data/\"\n",
    "# Directory where the CSV files produced by this notebook are written.\n",
    "SELECTED_DATA_DIR = \"../selected-data/\"\n",
    "# Sub-directory of DATA_DIR whose files are read as user CSVs\n",
    "# (presumably one file per user -- TODO confirm).\n",
    "CSV_DIR = \"csv/\"\n",
    "# Movie database file inside DATA_DIR, loaded with pd.read_json.\n",
    "MOVIES_FILE = \"imdb_database.json\"\n",
    "# Output file name (inside SELECTED_DATA_DIR) for the selected movies table.\n",
    "MOVIES_FILE_OUT = \"best_movie_ratings_features.csv\"\n",
    "# Output file name (inside SELECTED_DATA_DIR) for the selected user ratings table.\n",
    "USERS_FILE_OUT = \"users_ratings.csv\"\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Subset of movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def flatten_aka(d):\n",
    "    \"\"\"Extract the 'name' value of every entry of ``d``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    d : iterable of mappings, or anything else\n",
    "        Expected to be a list of dicts that each carry a 'name' key.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        The names in their original order, or an empty list when ``d`` is\n",
    "        not iterable (e.g. None or NaN) or an entry cannot be indexed by\n",
    "        'name'.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return [entry['name'] for entry in d]\n",
    "    except (TypeError, KeyError):\n",
    "        # TypeError: d is not iterable or an entry is not subscriptable by a\n",
    "        # string; KeyError: an entry has no 'name'. Unrelated exceptions such\n",
    "        # as KeyboardInterrupt are deliberately no longer swallowed.\n",
    "        return []\n",
    "\n",
    "movies = pd.read_json(DATA_DIR + MOVIES_FILE)\n",
    "# Keep only entries of kind \"movie\". The explicit copy makes the column\n",
    "# assignments below write to an independent frame rather than to a slice of\n",
    "# the loaded one (avoids pandas' SettingWithCopyWarning).\n",
    "movies = movies[movies.kind == \"movie\"].copy()\n",
    "# The id is used as the title and, further down, as the index.\n",
    "movies[\"title\"] = movies.id\n",
    "# Each rating entry is indexed by 'votes' and 'rank'; split it into two columns.\n",
    "# votes must be extracted before rating is overwritten with the rank.\n",
    "movies[\"votes\"] = movies.rating.apply(lambda x: x['votes'])\n",
    "movies[\"rating\"] = movies.rating.apply(lambda x: x['rank'])\n",
    "movies[\"aka\"] = movies.aka.apply(flatten_aka)\n",
    "movies = movies.drop_duplicates(subset=\"title\")\n",
    "# Keep the feature columns, index by title and drop rows with missing values.\n",
    "movies = movies[[\"aka\", \"genres\", \"year\", \"votes\", \"rating\", \"title\"]].set_index(\"title\").dropna()\n",
    "movies.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Rank by number of votes (most voted first) and keep the first 1000 rows.\n",
    "movies = (\n",
    "    movies\n",
    "    .sort_values(by=\"votes\", ascending=False)\n",
    "    .iloc[:1000]\n",
    ")\n",
    "movies.sample()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Write the selected movies table to the output directory.\n",
    "movies_out_path = SELECTED_DATA_DIR + MOVIES_FILE_OUT\n",
    "movies.to_csv(movies_out_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Users with at least N recommendations for these movies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "N = 50                  # minimum number of ratings a user needs among the selected movies\n",
    "MAX_USER_FILES = 10000  # number of randomly drawn user files to scan\n",
    "SEED = 0                # fixed seed so a fresh Run All draws the same files\n",
    "\n",
    "# Sort before shuffling: os.listdir returns entries in arbitrary order, which\n",
    "# would make the seeded shuffle non-reproducible.\n",
    "user_files = sorted(os.listdir(os.path.join(DATA_DIR, CSV_DIR)))\n",
    "random.Random(SEED).shuffle(user_files)\n",
    "user_files = user_files[:MAX_USER_FILES]\n",
    "\n",
    "selected_frames = []\n",
    "# Iterate over the list itself (not enumerate) so tqdm can show the total.\n",
    "for filename in tqdm(user_files):\n",
    "    # header=None reads the file without a header row; the former header=-1\n",
    "    # is rejected by current pandas versions.\n",
    "    user = pd.read_csv(os.path.join(DATA_DIR, CSV_DIR, filename), header=None)\n",
    "    # Column 1 is matched against the movies index below.\n",
    "    user = user.set_index(1)\n",
    "    # get only recommendations that are in movies\n",
    "    in_selection = user[user.index.isin(movies.index)]\n",
    "    # Count rows. The previous `.size` of the joined frame counted\n",
    "    # rows * columns, so the effective threshold was far below N.\n",
    "    if len(in_selection) >= N:\n",
    "        selected_frames.append(in_selection)\n",
    "\n",
    "if not selected_frames:\n",
    "    raise ValueError(\n",
    "        \"no user file has at least {} ratings for the selected movies\".format(N)\n",
    "    )\n",
    "\n",
    "# Concatenate once instead of calling DataFrame.append inside the loop\n",
    "# (quadratic copying, and removed in pandas 2.0).\n",
    "users = pd.concat(selected_frames)\n",
    "users.columns = [\"user\", \"rating\", \"link\"]\n",
    "users.index.name = \"movie\"\n",
    "users.sample()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Sanity check: every selected rating must refer to one of the selected movies.\n",
    "# Vectorised replacement for the commented-out row-by-row iterrows loop.\n",
    "assert users.index.isin(movies.index).all()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write the selected user ratings table to the output directory.\n",
    "users_out_path = SELECTED_DATA_DIR + USERS_FILE_OUT\n",
    "users.to_csv(users_out_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
